import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import plotly.express as px
from plotly import graph_objects as go
from pylab import rcParams
import sklearn
from sklearn.cluster import KMeans
plt.style.use('fivethirtyeight')
%matplotlib inline
# rcParams['figure.figsize'] = 15,6
# Load Mall_Customers.csv; the path is relative to the current working directory.
df = pd.read_csv('./Data/Mall_Customers.csv')
# Preview the first rows of the loaded frame.
df.head()
| CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | |
|---|---|---|---|---|---|
| 0 | 1 | Male | 19 | 15 | 39 |
| 1 | 2 | Male | 21 | 15 | 81 |
| 2 | 3 | Female | 20 | 16 | 6 |
| 3 | 4 | Female | 23 | 16 | 77 |
| 4 | 5 | Female | 31 | 17 | 40 |
# Summarize the columns: dtype and non-null count for each.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 200 entries, 0 to 199 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 200 non-null int64 1 Gender 200 non-null object 2 Age 200 non-null int64 3 Annual Income (k$) 200 non-null int64 4 Spending Score (1-100) 200 non-null int64 dtypes: int64(4), object(1) memory usage: 7.9+ KB
# Histogram of each numeric feature, drawn side by side in a single figure.
numeric_features = ('Age', 'Annual Income (k$)', 'Spending Score (1-100)')
plt.figure(figsize=(12, 5))
for panel, feature in enumerate(numeric_features, start=1):
    # One row, three columns; `panel` is the 1-based subplot slot.
    plt.subplot(1, 3, panel)
    sns.histplot(df[feature], bins=20)
    plt.title('Distribution of {}'.format(feature))
plt.show()
# Bar chart of the number of rows per value of the 'Gender' column.
plt.figure(figsize = (6,4))
sns.countplot(x = 'Gender', data = df, palette= 'hls')
<AxesSubplot:xlabel='Gender', ylabel='count'>
# Plot the pairwise relationships between the three numeric features.
sns.pairplot(df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']])
# The 'Annual Income (k$)' vs. 'Spending Score (1-100)' panel shows clear
# clustering behavior. 'Age' vs. 'Spending Score (1-100)' also shows some
# clustering behavior.
#### We will do the clustering with two subsets of the features:
#### i) Annual Income (k$) & Spending Score (1-100)
#### ii) Annual Income (k$), Spending Score (1-100) & Age
<seaborn.axisgrid.PairGrid at 0x15dbab910>
# Choosing features for clustering: annual income and spending score only.
X1 = df[['Annual Income (k$)', 'Spending Score (1-100)']].values

# Within-cluster sum of squares (KMeans inertia) for each candidate cluster count.
k_values = range(1, 11)
wcss = []
for k in k_values:
    model = KMeans(n_clusters=k, init='k-means++', random_state=42)
    model.fit(X1)
    wcss.append(model.inertia_)

# Plotting the elbow graph.
# The cluster counts are passed explicitly as x: given only y, matplotlib
# plots against 0-based indices, which would shift every point one cluster
# count too low on the 'Number of Clusters' axis.
plt.figure(figsize=(7, 5))
plt.plot(k_values, wcss, marker='o', alpha=0.5)
plt.xticks(list(k_values))
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('The Elbow Point Graph')
plt.show()
# The elbow suggests that the number of clusters should be 4 or 5. To choose between
# them, we compare the silhouette score of the model for several cluster counts.
from sklearn.metrics import silhouette_score
# Silhouette score of a KMeans fit on X1 for each candidate cluster count (3 to 6).
for n_groups in (3, 4, 5, 6):
    # fit() returns the estimator itself, so the fitted model can be bound directly.
    model = KMeans(init='k-means++', n_clusters=n_groups, random_state=42).fit(X1)
    score = silhouette_score(X1, model.labels_)
    print("Silhouette score for K = ", n_groups, ": {:.3f}".format(score))
# We can see that the silhouette score is highest for K = 5.
Silhouette score for K = 3 : 0.468 Silhouette score for K = 4 : 0.493 Silhouette score for K = 5 : 0.554 Silhouette score for K = 6 : 0.540
# Final two-feature model with K = 5.
# random_state is fixed to the same seed as the other KMeans fits in this file
# so that the clustering is reproducible between runs and is the same model
# whose silhouette score was evaluated for K = 5.
model = KMeans(n_clusters= 5, init = 'k-means++', random_state = 42)
model.fit(X1)
# print(model.get_params())
KMeans(n_clusters=5)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KMeans(n_clusters=5)
# Create scatter plot with different colors for each cluster.
# No matplotlib figure is opened here: the chart is a plotly figure, and a
# plt.figure() call would only emit an empty matplotlib figure.
fig = px.scatter(x=X1[:, 0], y=X1[:, 1], color=model.labels_, width=800, height=400)
fig.update_layout(xaxis_title='Annual Income (K$)', yaxis_title='Spending Score (1 - 100)',
                  title='KMeans Clustering Results', title_x=0.5, title_y=0.95,
                  coloraxis_showscale=False)
# Add cluster centroids as black X; the columns of cluster_centers_ are in the
# same order as the columns of X1 (income, then spending score).
centroids = model.cluster_centers_
fig.add_scatter(x=centroids[:, 0], y=centroids[:, 1],
                mode='markers', marker=dict(size=10, color='black', symbol='x'),
                name='Centroids')
# Show plot
fig.show()
<Figure size 504x432 with 0 Axes>
# Three-feature subset: age, annual income and spending score.
X2 = df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].values

# Within-cluster sum of squares (KMeans inertia) for each candidate cluster count.
k_values = range(1, 11)
wcss2 = []
for k in k_values:
    model = KMeans(n_clusters=k, init='k-means++', random_state=42)
    model.fit(X2)
    wcss2.append(model.inertia_)

# Elbow graph. The cluster counts are passed explicitly as x: given only y,
# matplotlib plots against 0-based indices, which would shift every point one
# cluster count too low on the 'Number of clusters' axis.
plt.figure(figsize=(7, 5))
plt.plot(k_values, wcss2, marker='o', alpha=0.5)
plt.xticks(list(k_values))
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('The Elbow Point Graph ')
plt.show()
# The elbow suggests that the number of clusters could be 4, 5 or 6. To choose between
# them, we compare the silhouette score of the model for several cluster counts.
# Silhouette score of a KMeans fit on X2 for each candidate cluster count (4 to 7).
for n_groups in (4, 5, 6, 7):
    # fit() returns the estimator itself, so the fitted model can be bound directly.
    model = KMeans(init='k-means++', n_clusters=n_groups, random_state=42).fit(X2)
    score = silhouette_score(X2, model.labels_)
    print("silhouette_score for K = ", n_groups, ": {:.3f}".format(score))
# We can see that the silhouette score is highest for K = 6,
# but it is not much higher than for K = 5, so we will use K = 5.
silhouette_score for K = 4 : 0.405 silhouette_score for K = 5 : 0.444 silhouette_score for K = 6 : 0.452 silhouette_score for K = 7 : 0.440
# Final three-feature model: K = 5, seeded like the evaluation fits on X2.
model = KMeans(n_clusters = 5, init = 'k-means++', random_state = 42)
# Left as a bare expression so the notebook displays the fitted estimator.
model.fit(X2)
# print(model.get_params())
KMeans(n_clusters=5, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KMeans(n_clusters=5, random_state=42)
# label2 = model.labels_
# centroids2 = model.cluster_centers_
# trace1 = go.Scatter3d(x= df['Age'],
# y= df['Spending Score (1-100)'],
# z= df['Annual Income (k$)'],
# mode='markers', marker=dict(color = label2, size= 20,
# line=dict(color= label2, width= 12),opacity=0.8))
# trace2 = go.Scatter3d(x=centroids2[:,0],
# y=centroids2[:,1],
# z=centroids2[:,2],
# mode='markers', marker=dict(color='black', size = 10, symbol = 'circle'))
# data = [trace1, trace2]
# layout = go.Layout(title= 'Clusters', title_x=0.5, title_y=0.95,
# scene = dict(
# xaxis = dict(title = 'Age'),
# yaxis = dict(title = 'Spending Score'),
# zaxis = dict(title = 'Annual Income')),
# margin = dict(l = 10, r = 10, b = 10, t = 10))
# fig = go.Figure(data=data, layout=layout)
# py.offline.iplot(fig)
import plotly.express as px
# 3D view of the three-feature clustering, with the fitted centroids overlaid.
label2 = model.labels_
centroids2 = model.cluster_centers_
# Store the cluster assignment on the frame so plotly can color by it.
df['labels'] = label2
fig = px.scatter_3d(df, x='Age', y='Spending Score (1-100)',
                    z='Annual Income (k$)', color='labels',
                    opacity=0.6, width=1000, height=600)
# Resize the data markers before the centroid trace is added, so the
# centroids keep their own marker size.
fig.update_traces(marker_size=15)
# The centroid columns follow the column order of X2 (Age, Annual Income,
# Spending Score), while the figure's axes are x=Age, y=Spending Score,
# z=Annual Income. Column 2 therefore goes on y and column 1 on z; passing
# them in array order would swap income and spending for every centroid.
fig.add_scatter3d(x=centroids2[:, 0], y=centroids2[:, 2], z=centroids2[:, 1],
                  mode='markers', marker=dict(size=10, color='black', symbol='circle'),
                  name='Centroids')
fig.update_layout(title='KMeans Clustering Results', title_x=0.5,
                  title_y=0.95, coloraxis_showscale=False)
fig.show()